library(tidyverse)
── Attaching core tidyverse packages ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.2 ✔ readr 2.1.4
✔ forcats 1.0.0 ✔ stringr 1.5.0
✔ ggplot2 3.4.2 ✔ tibble 3.2.1
✔ lubridate 1.9.2 ✔ tidyr 1.3.0
✔ purrr 1.0.1 ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
Attaching package: ‘janitor’
The following objects are masked from ‘package:stats’:
chisq.test, fisher.test
steam <- read_csv("raw_data/steam_checkpoint.csv")
Rows: 26564 Columns: 22── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (8): name, developer, publisher, multiplayer, categories, genres, steamspy_tags, owners
dbl (8): appid, required_age, achievements, positive_ratings, negative_ratings, average_playtime, median_playtime, price
lgl (5): free_to_play, virtual_reality_support, windows_support, mac_support, linux_support
date (1): release_date
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Where was i?
games_pre_cleaned <- read_csv("raw_data/Cleaned Data 2 GVGS&R.csv")
Rows: 6894 Columns: 15── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (5): Name, Genre, Publisher, Developer, Rating
dbl (10): Year_of_Release, NA_Sales, EU_Sales, JP_Sales, Other_Sales, Global_Sales, Critic_Score, Critic_Count, User_Score, User_Count
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
games_pre_cleaned
How dare you do the work for me
games_raw <- read_csv("raw_data/Raw Data GVGS&R.csv")
Rows: 16719 Columns: 16── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (8): Name, Platform, Year_of_Release, Genre, Publisher, User_Score, Developer, Rating
dbl (8): NA_Sales, EU_Sales, JP_Sales, Other_Sales, Global_Sales, Critic_Score, Critic_Count, User_Count
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
games_raw %>%
filter(Name == "Minecraft")
Why does everyone have the same dataset
back to steam i guess
steam
steam %>%
distinct(genres)
Already have F2P covered by another column, so we can drop that in
genres
steam %>%
mutate(genres = str_remove_all(genres, "Free to Play")) %>%
filter(free_to_play == TRUE)
Is this worth doing? Duplicated info doesn’t hurt i suppose - leave
it for now
single player?
steam <- steam %>%
mutate(singleplayer = case_when(
str_detect(categories, "Single-player") ~ TRUE,
str_detect(steamspy_tags, "Singleplayer") ~ TRUE,
TRUE ~ FALSE
),.before = multiplayer)
steam %>%
filter(free_to_play == TRUE)
steam
# adding column to simplify ratings
steam <- steam %>%
mutate(total_reviews = positive_ratings + negative_ratings) %>%
arrange(desc(total_reviews)) %>%
mutate(percent_positive_reviews = positive_ratings / total_reviews) %>%
mutate(general_rating = case_when(
total_reviews > 1000 & percent_positive_reviews >= 0.95 ~ "Extremely Positive",
total_reviews > 1000 & percent_positive_reviews >= 0.75 ~ "Positive",
total_reviews > 1000 & percent_positive_reviews >= 0.55 ~ "Mostly Positive",
total_reviews > 1000 & percent_positive_reviews >= 0.50 ~ "Mixed" ,
total_reviews > 1000 & percent_positive_reviews >= 0.35 ~ " Mostly Negative",
total_reviews > 1000 & percent_positive_reviews >= 0.15 ~ "Negative",
total_reviews > 1000 & percent_positive_reviews < 0.15 ~ "Extremely Negative",
positive_ratings == 0 ~ "No Positive Reviews",
negative_ratings == 0 ~ "No Negative Reviews",
positive_ratings & negative_ratings == 0 ~ "No Reviews",
total_reviews < 1000 ~ "Not Enough Reviews",
TRUE ~ "No Data"
),.before = positive_ratings) %>%
select(-percent_positive_reviews)
backloggd <- read_csv("clean_data/backloggd_clean.csv")
Rows: 1115 Columns: 15── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (9): title, team, genre_tag, genre_tag_2, genre_tag_3, genre_tag_4, genre_tag_5, genre_tag_6, genre_tag_7
dbl (5): years_since_release, rating, number_of_reviews, wishlist, plays
date (1): release_date
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
backloggd
games_raw <- games_raw %>%
clean_names()
# Platform isn't a genre - correcting
games_raw <- games_raw %>%
mutate(genre = case_when(
genre == "Platform" ~ "Platformer",
TRUE ~ genre
))
nintendo_published <- games_raw %>%
filter(publisher == "Nintendo")
genre_publisher_sales <- games_raw %>%
group_by(publisher, genre) %>%
summarise(sum(global_sales)) %>%
arrange(desc(`sum(global_sales)`))
`summarise()` has grouped output by 'publisher'. You can override using the `.groups` argument.
write_csv(genre_publisher_sales, "clean_data/genre_by_publisher.csv")
genre_publisher_sales %>%
filter(publisher == "From Software")
games_raw %>%
distinct(rating)
# clarifying that the rating is for ESRP - the american system
games_raw <- games_raw %>%
rename("esrp" = "rating")
# Adding PEGI equivalent
games_raw <- games_raw %>%
mutate(pegi_equivalent = case_when(
esrp == "EC" ~ "3",
esrp == "E" ~ "7",
esrp == "E10+" ~ "12",
esrp == "T" ~ "16",
esrp == "M" ~ "18",
esrp == "AO" ~ "18",
esrp == "K-A" ~ "7",
esrp == "RP" ~ "No Rating",
TRUE ~ "No Rating Found"
))
total_sales_by_genre <- games_raw %>%
group_by(genre) %>%
summarise(sum(global_sales)) %>%
rename("total_sales" = "sum(global_sales)") %>%
arrange(desc(total_sales))
total_sales_by_genre <- total_sales_by_genre %>%
drop_na()
games_raw
breakdown_sales_by_genre <- games_raw %>%
group_by(genre) %>%
summarise(sum(na_sales), sum(eu_sales), sum(jp_sales), sum(other_sales)) %>%
rename("total_na_sales"= "sum(na_sales)", "total_eu_sales" = "sum(eu_sales)", "total_jp_sales" = "sum(jp_sales)", "total_other_sales" = "sum(other_sales)")
total_sales_by_genre # this seems to have some rounding differences compared to breakdown, so i'll retire this one
how does the sales for the top 3 genres look in each region?
breakdown_sales_by_genre <- breakdown_sales_by_genre %>%
group_by(genre) %>%
mutate(total_sales = sum(total_na_sales + total_eu_sales + total_jp_sales + total_other_sales)) %>%
drop_na()
breakdown_sales_by_genre %>%
filter(genre %in% c("Action", "Sports", "Shooter")) %>%
ggplot(aes(x = total_sales, y = genre))+
geom_col() +
geom_col(aes(x = total_na_sales), fill = "green")+
geom_col(aes(x = total_eu_sales), fill = "red") +
geom_col(aes(x = total_jp_sales), fill = "blue")

# this looks shit
What about trends? How did each genres sales adjust every 5 or so
years?
games_raw %>%
filter(year_of_release == "2016")
games_raw %>%
mutate(year_of_release = as.integer(year_of_release)) %>%
distinct(year_of_release) %>%
arrange(year_of_release)
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `year_of_release = as.integer(year_of_release)`.
Caused by warning:
! NAs introduced by coercion
sales_2019 <- read_csv("raw_data/sales-2019.csv")
Rows: 55792 Columns: 23── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (10): Name, basename, Genre, ESRB_Rating, Platform, Publisher, Developer, Last_Update, url, img_url
dbl (12): Rank, Critic_Score, User_Score, Total_Shipped, Global_Sales, NA_Sales, PAL_Sales, JP_Sales, Other_Sales, Year, status, Vgchartzscore
lgl (1): VGChartz_Score
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sales_2019 %>%
distinct(Year) %>%
arrange(desc(Year))
sales_2019 %>%
filter(Year == "2019") %>%
select(Name, Year, Platform)
# data for 2020 is total nonsense. Some information in 2019 is incorrect.
Going off the fact that there is the majority of usable information
i am able to get is from 2019 or before, we’re just going to pretend
we’re living in an
alternative 2019
As such, I am dropping all data from 2020 in this dataset
sales_2019 <- sales_2019 %>%
filter(Year != "2020")
Looking at backloggd again
backloggd
# creating a range_rating column to reduce noise when it comes to plotting results
backlogged <- backloggd %>%
mutate(rating_range = case_when(
rating < 5 & rating >= 4 ~ "4+",
rating < 4 & rating >= 3 ~ "3 to 4",
rating < 3 & rating >= 2 ~ "2 to 3",
rating < 2 & rating >= 1 ~ "1 to 2",
rating < 1 ~ ">1",
TRUE ~ "No rating"
))
DIFFICULTIES WITH THE DATA
After having little success attempting to get a more up to date data
set, i’ve decided to stick with what I already have - Given more time, i
would have liked to have run the steam/steam spy script to scrape all
the steam data, but unfortunately i couldn’t commit 3 days to it. There
doesn’t seem to be much in the way of modern sales data, with most
companies not releasing figures unless its to do with a milestone.
Genres are quite vague - While it would be nice to have tags like “Open
World” or “Rogue-like/lite”, we instead mostly have “Action”, “Adventure”
and “Indie”, which are not hugely descriptive. This might make recommending
a genre to work on harder.
DAY 3
sales_2019 %>%
summary()
Rank Name basename Genre ESRB_Rating Platform Publisher Developer VGChartz_Score Critic_Score
Min. : 1 Length:54482 Length:54482 Length:54482 Length:54482 Length:54482 Length:54482 Length:54482 Mode:logical Min. : 1.00
1st Qu.:13638 Class :character Class :character Class :character Class :character Class :character Class :character Class :character NA's:54482 1st Qu.: 6.40
Median :27421 Mode :character Mode :character Mode :character Mode :character Mode :character Mode :character Mode :character Median : 7.50
Mean :27465 Mean : 7.21
3rd Qu.:41202 3rd Qu.: 8.30
Max. :55791 Max. :10.00
NA's :47952
User_Score Total_Shipped Global_Sales NA_Sales PAL_Sales JP_Sales Other_Sales Year Last_Update url status
Min. : 2.00 Min. : 0.03 Min. : 0.00 Min. :0.00 Min. :0.00 Min. :0.00 Min. :0.00 Min. :1970 Length:54482 Length:54482 Min. :1
1st Qu.: 7.80 1st Qu.: 0.20 1st Qu.: 0.03 1st Qu.:0.05 1st Qu.:0.01 1st Qu.:0.02 1st Qu.:0.00 1st Qu.:2000 Class :character Class :character 1st Qu.:1
Median : 8.50 Median : 0.59 Median : 0.12 Median :0.12 Median :0.04 Median :0.05 Median :0.01 Median :2008 Mode :character Mode :character Median :1
Mean : 8.27 Mean : 1.89 Mean : 0.37 Mean :0.28 Mean :0.16 Mean :0.11 Mean :0.04 Mean :2006 Mean :1
3rd Qu.: 9.20 3rd Qu.: 1.80 3rd Qu.: 0.36 3rd Qu.:0.29 3rd Qu.:0.14 3rd Qu.:0.12 3rd Qu.:0.04 3rd Qu.:2011 3rd Qu.:1
Max. :10.00 Max. :82.86 Max. :20.32 Max. :9.76 Max. :9.85 Max. :2.69 Max. :3.12 Max. :2019 Max. :1
NA's :54154 NA's :52661 NA's :35132 NA's :41528 NA's :41334 NA's :47458 NA's :39005
Vgchartzscore img_url
Min. :2.60 Length:54482
1st Qu.:6.90 Class :character
Median :7.90 Mode :character
Mean :7.49
3rd Qu.:8.50
Max. :9.60
NA's :53726
All of the VGchartz scores are empty, so it can go. Also, same as
above, there’s no such genre as Platform
1970 is nonsense. dropping entire year
sales_2019 %>%
group_by(five_year_period) %>%
count(genre) %>%
slice_max(n = 2, n) %>%
ggplot(aes(x = five_year_period, y = n, fill = genre)) +
geom_col(colour = "black") +
labs(
title = "Most Common Genres Over Time",
subtitle = "Shown over 5 year intervals",
caption = "From 2019 games sales data"
) +
scale_fill_manual(values = c(
"Platformer" = "#04E762",
"Action" = "#DC0073",
"Sports" = "#008BF8",
"Misc" = "#f5b700",
"Shooter" = "#390099"
)) +
theme_light() +
theme(axis.text.x = element_text(angle = 25, hjust = 0.5, size = 9, face = "bold", vjust = 0.8)) +
theme(axis.line = element_line(colour = "black"))+
theme(axis.text.y = element_text(size = 9, face = "bold")) +
xlab("Interval") +
ylab("Number of Games in the genre released") +
theme(axis.title.x = element_text(face = "italic", size = 10)) +
theme(axis.title.y = element_text(face = "italic", size = 10)) +
theme(plot.caption = element_text(size = 7, hjust = 1.25)) +
labs(fill = "Genre")

# Misc isnt very descriptive or helpful - in terms of recommending making something to someone it means essentially "try anything"
# PLOT SHOWING MOST COMMON GENRE OF GAMES - NOT NECESSARILY BEST SELLING
sales_2019 %>%
group_by(five_year_period) %>%
filter(genre != "Misc") %>%
count(genre) %>%
slice_max(n = 2, n) %>%
ggplot(aes(x = five_year_period, y = n, fill = genre)) +
geom_col(colour = "black") +
labs(
title = "Most Common Genres Over Time",
subtitle = "Shown over 5 year intervals - Excluding 'Misc' genre",
caption = "From 2019 game sales data"
) +
scale_fill_manual(values = c(
"Platformer" = "#04E762",
"Action" = "#DC0073",
"Sports" = "#008BF8",
# "Misc" = "#f5b700",
"Shooter" = "#390099",
"Adventure" = "#8ac926",
"Role-Playing" = "#ff6b35"
)) +
theme_light() +
theme(axis.text.x = element_text(angle = 25, hjust = 0.5, size = 9, face = "bold", vjust = 0.8)) +
theme(axis.line = element_line(colour = "black"))+
theme(axis.text.y = element_text(size = 9, face = "bold")) +
xlab("Interval") +
ylab("Number of Games in the genre released") +
theme(axis.title.x = element_text(face = "italic", size = 10)) +
theme(axis.title.y = element_text(face = "italic", size = 10)) +
theme(plot.caption = element_text(size = 7, hjust = 1.25)) +
labs(fill = "Genre")

NA
This is slightly concerning - a lot of these titles could easily have
a genre that isn’t the cure-all “Misc” - Include both graphs - one with
misc, one excluding it
What about a similar plot for steam games?
steam_years <- steam_years %>%
separate(col = genres, sep = ";", into = paste0("Genre", seq_len(max(nchar(.$genres))))) %>% # this is absolutely horrendous, but it gets me all the genres into separate columns so whatever
select(-Genre14:-Genre180) %>%
pivot_longer(cols = Genre1:Genre13, names_to = "genres") %>%
drop_na(value) %>% # ok now we can make the year intervals and group them to find what genres were popular
mutate(year_range = case_when(
release_year <= 2019 & release_year >= 2016 ~ "2016 - 2019",
release_year <= 2015 & release_year >= 2012 ~ "2012 - 2015",
release_year <= 2011 & release_year >= 2009 ~ "2009 - 2011",
release_year <= 2008 & release_year >= 2005 ~ "2005 - 2008",
release_year <= 2004 & release_year>= 2000 ~ "2000 - 2004",
release_year < 2001 ~ "Before 2000",
TRUE ~ "this shouldnt appear"
),.after = release_year)
Warning: Expected 180 pieces. Missing pieces filled with `NA` in 26564 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].

steam_years %>%
filter(release_year >= 2009) %>%
group_by(release_year) %>%
count(value) %>%
slice_max(n = 2, n) %>%
arrange(desc(n)) %>%
ggplot(aes(x = release_year, y = n, fill = value)) +
geom_col(colour = "black") +
labs(
title = "Most Common Genres Over Time",
subtitle = "Shown year by year from 2009 until May 2019",
caption = "From 2019 Steam/Steam Spy data"
) +
scale_fill_manual(values = c(
"Indie" = "#04E762",
"Action" = "#DC0073",
"Casual" = "#008BF8",
"Adventure" = "#f5b700"
)) +
theme_light() +
theme(axis.text.x = element_text(angle = 25, hjust = 0.5, size = 9, face = "bold", vjust = 0.8)) +
theme(axis.line = element_line(colour = "black"))+
theme(axis.text.y = element_text(size = 9, face = "bold")) +
xlab("Interval") +
ylab("Number of Games in the genre released") +
theme(axis.title.x = element_text(face = "italic", size = 10)) +
theme(axis.title.y = element_text(face = "italic", size = 10)) +
theme(plot.caption = element_text(size = 7, hjust = 1.25)) +
scale_x_continuous(breaks = c(2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019)) +
labs(fill = "Genre")

backloggd_genres %>%
pivot_longer(cols = genre_tag:genre_tag_7, names_to = "list", values_to = "genre") %>%
drop_na(genre)
Error in pivot_longer(., cols = genre_tag:genre_tag_7, names_to = "list", :
object 'backloggd_genres' not found

We have established that Indie, Action and Adventure games seem to
be the most popular - But what examples of these are there?
---
title: "DAY 2"
output: html_notebook
---

```{r}
library(tidyverse)
library(janitor)

steam <- read_csv("raw_data/steam_checkpoint.csv")
```

Where was i?

```{r}
games_pre_cleaned <- read_csv("raw_data/Cleaned Data 2 GVGS&R.csv")
```

```{r}
games_pre_cleaned
```
How dare you do the work for me

```{r}
games_raw <- read_csv("raw_data/Raw Data GVGS&R.csv")
```

```{r}
games_raw %>% 
  filter(Name == "Minecraft")
```
Why does everyone have the same dataset

back to steam i guess

```{r}
steam
```

```{r}
steam %>% 
  distinct(genres)
```

Free-to-play status is already covered by the `free_to_play` column, so the "Free to Play" tag could be dropped from `genres`.

```{r}
steam %>% 
  mutate(genres = str_remove_all(genres, "Free to Play")) %>% 
  filter(free_to_play == TRUE)
```

Is this worth doing?
Duplicated info doesn't hurt i suppose - leave it for now

single player?

```{r}
# Flag single-player support: TRUE when either the Steam categories or the
# SteamSpy tags mention it, FALSE otherwise (a missing text value counts as
# "not mentioned").
steam <- steam %>%
  mutate(
    singleplayer =
      coalesce(str_detect(categories, "Single-player"), FALSE) |
      coalesce(str_detect(steamspy_tags, "Singleplayer"), FALSE),
    .before = multiplayer
  )
```

```{r}
steam %>% 
  filter(free_to_play == TRUE) 
```
```{r}
steam
```
```{r}
# Add a coarse, human-readable rating label (`general_rating`) derived from
# the positive / negative review counts.
#
# Branch order matters because case_when() returns the first match:
#   1. games with no reviews at all,
#   2. sentiment bands for games with at least 1000 reviews,
#   3. special cases and a catch-all for games with fewer than 1000 reviews,
#   4. "No Data" when the counts themselves are missing.
steam <- steam %>%
  mutate(total_reviews = positive_ratings + negative_ratings) %>%
  arrange(desc(total_reviews)) %>%
  # Helper column only; NaN when total_reviews is 0, which is why the
  # zero-review case is tested first below. Removed again at the end.
  mutate(percent_positive_reviews = positive_ratings / total_reviews) %>%
  mutate(
    general_rating = case_when(
      total_reviews == 0 ~ "No Reviews",
      total_reviews >= 1000 & percent_positive_reviews >= 0.95 ~ "Extremely Positive",
      total_reviews >= 1000 & percent_positive_reviews >= 0.75 ~ "Positive",
      total_reviews >= 1000 & percent_positive_reviews >= 0.55 ~ "Mostly Positive",
      total_reviews >= 1000 & percent_positive_reviews >= 0.50 ~ "Mixed",
      total_reviews >= 1000 & percent_positive_reviews >= 0.35 ~ "Mostly Negative",
      total_reviews >= 1000 & percent_positive_reviews >= 0.15 ~ "Negative",
      total_reviews >= 1000 ~ "Extremely Negative",
      positive_ratings == 0 ~ "No Positive Reviews",
      negative_ratings == 0 ~ "No Negative Reviews",
      total_reviews < 1000 ~ "Not Enough Reviews",
      # Only reached when the review counts are NA.
      TRUE ~ "No Data"
    ),
    .before = positive_ratings
  ) %>%
  select(-percent_positive_reviews)
```



```{r}
backloggd <- read_csv("clean_data/backloggd_clean.csv")
```

```{r}
backloggd
```


```{r}
games_raw <- games_raw %>% 
  clean_names()
```

```{r}
# "Platform" is not a genre name; relabel it "Platformer". Every other value
# (including missing ones) passes through unchanged.
games_raw <- games_raw %>%
  mutate(genre = if_else(genre == "Platform", "Platformer", genre))
```

```{r}
nintendo_published <- games_raw %>% 
  filter(publisher == "Nintendo")
```

```{r}
# Global sales summed per publisher/genre pair, largest first. The summary
# column keeps the name `sum(global_sales)`, which is also the header written
# to the CSV in the next chunk.
genre_publisher_sales <- games_raw %>%
  group_by(publisher, genre) %>%
  summarise(
    `sum(global_sales)` = sum(global_sales)
  ) %>%
  arrange(
    desc(`sum(global_sales)`)
  )
```
```{r}
write_csv(genre_publisher_sales, "clean_data/genre_by_publisher.csv")
```

```{r}
genre_publisher_sales %>% 
  filter(publisher == "From Software")
```

```{r}
games_raw %>% 
  distinct(rating)
```


```{r}
# Make it explicit that `rating` is the American age rating. The column is
# called `esrp` here; the rating body is presumably the ESRB - confirm before
# renaming, since later chunks refer to `esrp`.
games_raw <- games_raw %>%
  rename(esrp = rating)
```

```{r}
# Map each American rating code in `esrp` to the PEGI age band chosen for it.
# Codes sharing a band are grouped; anything unlisted (or missing) becomes
# "No Rating Found".
games_raw <- games_raw %>%
  mutate(
    pegi_equivalent = case_when(
      esrp == "EC" ~ "3",
      esrp %in% c("E", "K-A") ~ "7",
      esrp == "E10+" ~ "12",
      esrp == "T" ~ "16",
      esrp %in% c("M", "AO") ~ "18",
      esrp == "RP" ~ "No Rating",
      TRUE ~ "No Rating Found"
    )
  )
  
```

```{r}
# Global sales summed per genre, largest first. The summary column is named
# directly in summarise() rather than renamed afterwards.
total_sales_by_genre <- games_raw %>%
  group_by(genre) %>%
  summarise(total_sales = sum(global_sales)) %>%
  arrange(desc(total_sales))
```

```{r}
total_sales_by_genre <- total_sales_by_genre %>% 
  drop_na()
```

```{r}
games_raw 
```

```{r}
# Regional sales summed per genre, one column per region.
breakdown_sales_by_genre <- games_raw %>%
  group_by(genre) %>%
  summarise(
    total_na_sales = sum(na_sales),
    total_eu_sales = sum(eu_sales),
    total_jp_sales = sum(jp_sales),
    total_other_sales = sum(other_sales)
  )
```

```{r}
total_sales_by_genre # this seems to have some rounding differences compared to breakdown, so i'll retire this one 
```
how does the sales for the top 3 genres look in each region?


```{r}
# Add the all-region total for each genre, then drop rows with any missing
# value. The table already has one row per genre, so a plain row-wise sum is
# enough; no group_by()/sum() is needed and the result stays ungrouped.
breakdown_sales_by_genre <- breakdown_sales_by_genre %>%
  mutate(
    total_sales = total_na_sales + total_eu_sales +
      total_jp_sales + total_other_sales
  ) %>%
  drop_na()
```
```{r}

# Horizontal bars for three selected genres: total sales first, then the
# North America, EU and Japan figures drawn as additional layers.
breakdown_sales_by_genre %>% 
  filter(genre %in% c("Action", "Sports", "Shooter")) %>% 
  ggplot(aes(x = total_sales, y = genre))+
  geom_col() +
  geom_col(aes(x = total_na_sales), fill = "green")+
  geom_col(aes(x = total_eu_sales), fill = "red") +
  geom_col(aes(x = total_jp_sales), fill = "blue") 

# NOTE: this chart is hard to read - each geom_col() layer is drawn over the
# previous one, so the regional bars overlap rather than sit side by side.
```

What about trends? How did each genre's sales change every 5 or so years?

```{r}
games_raw %>% 
  filter(year_of_release == "2016")

games_raw %>% 
  mutate(year_of_release = as.integer(year_of_release)) %>% 
  distinct(year_of_release) %>% 
  arrange(year_of_release)

```

```{r}
sales_2019 <- read_csv("raw_data/sales-2019.csv")
```

```{r}
sales_2019 %>% 
  distinct(Year) %>% 
  arrange(desc(Year))
```
```{r}
sales_2019 %>% 
  filter(Year == "2019") %>% 
  select(Name, Year, Platform)

# data for 2020 is total nonsense. Some information in 2019 is incorrect.
```
Given that the majority of the usable information I am able to get is from 2019 or before, we're just going to pretend we're living in an
alternative 2019.

As such, I am dropping all data from 2020 in this dataset 

```{r}
# Keep only rows whose Year is not 2020. filter() also discards rows where
# the comparison evaluates to NA, so rows with a missing Year go as well.
sales_2019 <- sales_2019 %>%
  filter(Year != 2020)
```

# Looking at backloggd again

```{r}
backloggd
```
```{r}
# Bucket the numeric `rating` into coarse bands (`rating_range`) to reduce
# noise when plotting. A rating of exactly 5 belongs to the top band, and
# ratings below 1 are labelled "<1". Missing or out-of-range ratings fall
# through to "No rating".
# NOTE(review): the result is stored in `backlogged`, not `backloggd`, so the
# later chunks that read `backloggd` do not see this column - confirm intent.
backlogged <- backloggd %>%
  mutate(
    rating_range = case_when(
      rating >= 4 & rating <= 5 ~ "4+",
      rating >= 3 & rating < 4 ~ "3 to 4",
      rating >= 2 & rating < 3 ~ "2 to 3",
      rating >= 1 & rating < 2 ~ "1 to 2",
      rating < 1 ~ "<1",
      TRUE ~ "No rating"
    )
  )
```

## DIFFICULTIES WITH THE DATA

After having little success attempting to get a more up-to-date data set, I've decided to stick with what I already have. Given more time, I would have liked
to have run the Steam/SteamSpy script to scrape all the Steam data, but unfortunately I couldn't commit 3 days to it.
There doesn't seem to be much in the way of modern sales data, with most companies not releasing figures unless it's to do with a milestone.
Genres are quite vague - while it would be nice to have tags like "Open World" or "Rogue-like/lite", we instead mostly have "Action", "Adventure" and "Indie", which
are not hugely descriptive. This might make recommending a genre to work on harder.

----------------------------------

# DAY 3

```{r}
sales_2019 %>% 
  summary()
```
All of the VGchartz scores are empty, so it can go. Also, same as above, there's no such genre as Platform

```{r}
# Standardise the column names, drop `vg_chartz_score` (described in the
# notes above as entirely empty) and relabel the "Platform" genre as
# "Platformer"; other genre values are left as they are.
sales_2019 <- sales_2019 %>%
  clean_names() %>%
  select(-vg_chartz_score) %>%
  mutate(genre = if_else(genre == "Platform", "Platformer", genre))
  
```

```{r}
games_raw
```

```{r}
sales_2019 %>% 
  group_by(name) %>% 
  summarise(sum(total_shipped)) %>% 
  arrange(desc(`sum(total_shipped)`)) 

# total shipped might not be the best metric for sales, but better than nothing
```

```{r}
sales_2019 %>% 
  distinct(year) %>% 
  arrange(desc(year))
```
```{r}
sales_2019 <- sales_2019 %>% 
  filter(year != 1970)
```

1970 is nonsense. dropping entire year

```{r}
# Label each row with the release-year band it falls into, for plotting
# trends over time. between() is inclusive at both ends.
# NOTE: despite the column name, the bands are not all five years wide
# (e.g. 2014-2019 spans six calendar years, 1981-1984 spans four).
sales_2019 <- sales_2019 %>%
  mutate(
    five_year_period = case_when(
      between(year, 2014, 2019) ~ "2014 - 2019",
      between(year, 2008, 2013) ~ "2008 - 2013",
      between(year, 2002, 2007) ~ "2002 - 2007",
      between(year, 1996, 2001) ~ "1996 - 2001",
      between(year, 1991, 1995) ~ "1991 - 1995",
      between(year, 1985, 1990) ~ "1985 - 1990",
      between(year, 1981, 1984) ~ "1981 - 1984",
      year < 1981 ~ "1980 and before",
      # Reached only when `year` is missing or lies outside every band.
      TRUE ~ "this shouldnt appear"
    ),
    .after = year
  )
```

```{r}
# Stacked bars of the most frequent genres (by row count) in each period.
# slice_max() keeps the two highest counts per period.
sales_2019 %>%
  group_by(five_year_period) %>%
  count(genre) %>%
  slice_max(order_by = n, n = 2) %>%
  ggplot(aes(x = five_year_period, y = n, fill = genre)) +
  geom_col(colour = "black") +
  scale_fill_manual(
    values = c(
      "Platformer" = "#04E762",
      "Action" = "#DC0073",
      "Sports" = "#008BF8",
      "Misc" = "#f5b700",
      "Shooter" = "#390099"
    )
  ) +
  labs(
    title = "Most Common Genres Over Time",
    subtitle = "Shown over 5 year intervals",
    caption = "From 2019 games sales data",
    x = "Interval",
    y = "Number of Games in the genre released",
    fill = "Genre"
  ) +
  theme_light() +
  theme(
    axis.line = element_line(colour = "black"),
    axis.text.x = element_text(
      angle = 25, hjust = 0.5, vjust = 0.8, size = 9, face = "bold"
    ),
    axis.text.y = element_text(size = 9, face = "bold"),
    axis.title.x = element_text(face = "italic", size = 10),
    axis.title.y = element_text(face = "italic", size = 10),
    plot.caption = element_text(size = 7, hjust = 1.25)
  )

# "Misc" isn't very descriptive or helpful - as a recommendation of what to
# make, it essentially means "try anything".
```

```{r}

# Most common genres per period - NOT necessarily the best selling ones.
# Same chart as the previous chunk, with the "Misc" genre removed before
# counting.
sales_2019 %>%
  filter(genre != "Misc") %>%
  group_by(five_year_period) %>%
  count(genre) %>%
  slice_max(order_by = n, n = 2) %>%
  ggplot(aes(x = five_year_period, y = n, fill = genre)) +
  geom_col(colour = "black") +
  scale_fill_manual(
    values = c(
      "Platformer" = "#04E762",
      "Action" = "#DC0073",
      "Sports" = "#008BF8",
      # "Misc" = "#f5b700",
      "Shooter" = "#390099",
      "Adventure" = "#8ac926",
      "Role-Playing" = "#ff6b35"
    )
  ) +
  labs(
    title = "Most Common Genres Over Time",
    subtitle = "Shown over 5 year intervals - Excluding 'Misc' genre",
    caption = "From 2019 game sales data",
    x = "Interval",
    y = "Number of Games in the genre released",
    fill = "Genre"
  ) +
  theme_light() +
  theme(
    axis.line = element_line(colour = "black"),
    axis.text.x = element_text(
      angle = 25, hjust = 0.5, vjust = 0.8, size = 9, face = "bold"
    ),
    axis.text.y = element_text(size = 9, face = "bold"),
    axis.title.x = element_text(face = "italic", size = 10),
    axis.title.y = element_text(face = "italic", size = 10),
    plot.caption = element_text(size = 7, hjust = 1.25)
  )
         
```

```{r}
# Inspect the "Misc" titles from the most recent period. The label has to
# match the value written into `five_year_period` ("2014 - 2019"); the
# reversed spelling "2019 - 2014" never matches and returns zero rows.
sales_2019 %>%
  filter(
    five_year_period == "2014 - 2019",
    genre == "Misc"
  )
```
This is slightly concerning - a lot of these titles could easily have a genre that isn't the cure-all "Misc" - 
Include both graphs - one with misc, one excluding it

```{r}
games_raw %>% 
  filter(genre == "Misc")
```
What about a similar plot for steam games?

```{r}
steam
```
```{r}
steam_years <- steam %>% 
  mutate(release_year = year(release_date),.after = release_date)
```
```{r}
steam_years %>% 
  mutate(length_genre = nchar(genres),.after = name) %>% 
  arrange(desc(length_genre))
```

```{r}
steam_years %>% 
  distinct(release_year) %>% 
  arrange(desc(release_year))
```
```{r}
# Reshape `steam_years` to one row per game/genre pair and add a release-year
# band.
#
# `genres` is a ";"-delimited string. The number of helper columns is taken
# from the data (the largest number of genres listed for any one game)
# instead of hard-coding Genre1..Genre180 and then discarding Genre14 onwards,
# so no genre is silently dropped and the chunk keeps working if the data
# changes.
n_genre_cols <- max(lengths(strsplit(steam_years$genres, ";", fixed = TRUE)))
genre_cols <- paste0("Genre", seq_len(n_genre_cols))

steam_years <- steam_years %>%
  # fill = "right" pads games with fewer genres with NA without a warning.
  separate(col = genres, into = genre_cols, sep = ";", fill = "right") %>%
  # After pivoting, `genres` holds the helper column name ("Genre1", ...)
  # and `value` holds the genre itself.
  pivot_longer(cols = all_of(genre_cols), names_to = "genres") %>%
  drop_na(value) %>%
  # Year bands used to group genres by popularity over time.
  mutate(
    year_range = case_when(
      release_year <= 2019 & release_year >= 2016 ~ "2016 - 2019",
      release_year <= 2015 & release_year >= 2012 ~ "2012 - 2015",
      release_year <= 2011 & release_year >= 2009 ~ "2009 - 2011",
      release_year <= 2008 & release_year >= 2005 ~ "2005 - 2008",
      release_year <= 2004 & release_year >= 2000 ~ "2000 - 2004",
      release_year < 2000 ~ "Before 2000",
      # Reached only when release_year is missing or later than 2019.
      TRUE ~ "this shouldnt appear"
    ),
    .after = release_year
  )
```

```{r}
steam_years %>% 
  filter(year_range == "2000 - 2004")
```
```{r}
# Most frequent Steam genres (by row count) per year band, excluding the
# "Before 2000" band. slice_max() keeps the two highest counts per band.
steam_years %>%
  filter(year_range != "Before 2000") %>%
  group_by(year_range) %>%
  count(value) %>%
  slice_max(order_by = n, n = 2) %>%
  arrange(desc(n)) %>%
  ggplot(aes(x = year_range, y = n, fill = value)) +
  geom_col(colour = "black") +
  scale_fill_manual(
    values = c(
      "Indie" = "#04E762",
      "Action" = "#DC0073"
    )
  ) +
  labs(
    title = "Most Common Genres Over Time",
    subtitle = "Shown over 3-4 year intervals",
    caption = "From 2019 Steam/Steam Spy data",
    x = "Interval",
    y = "Number of Games in the genre released",
    fill = "Genre"
  ) +
  theme_light() +
  theme(
    axis.line = element_line(colour = "black"),
    axis.text.x = element_text(
      angle = 25, hjust = 0.5, vjust = 0.8, size = 9, face = "bold"
    ),
    axis.text.y = element_text(size = 9, face = "bold"),
    axis.title.x = element_text(face = "italic", size = 10),
    axis.title.y = element_text(face = "italic", size = 10),
    plot.caption = element_text(size = 7, hjust = 1.25)
  )
```

```{r}
# Happy with this one as a comparison point.
# Year-by-year version of the Steam genre chart: the two highest genre counts
# for each release year from 2009 onwards.
steam_years %>%
  filter(release_year >= 2009) %>%
  group_by(release_year) %>%
  count(value) %>%
  slice_max(order_by = n, n = 2) %>%
  arrange(desc(n)) %>%
  ggplot(aes(x = release_year, y = n, fill = value)) +
  geom_col(colour = "black") +
  scale_fill_manual(
    values = c(
      "Indie" = "#04E762",
      "Action" = "#DC0073",
      "Casual" = "#008BF8",
      "Adventure" = "#f5b700"
    )
  ) +
  # One axis tick per year shown.
  scale_x_continuous(breaks = 2009:2019) +
  labs(
    title = "Most Common Genres Over Time",
    subtitle = "Shown year by year from 2009 until May 2019",
    caption = "From 2019 Steam/Steam Spy data",
    x = "Interval",
    y = "Number of Games in the genre released",
    fill = "Genre"
  ) +
  theme_light() +
  theme(
    axis.line = element_line(colour = "black"),
    axis.text.x = element_text(
      angle = 25, hjust = 0.5, vjust = 0.8, size = 9, face = "bold"
    ),
    axis.text.y = element_text(size = 9, face = "bold"),
    axis.title.x = element_text(face = "italic", size = 10),
    axis.title.y = element_text(face = "italic", size = 10),
    plot.caption = element_text(size = 7, hjust = 1.25)
  )
```


```{r}
steam_years %>% 
  filter(release_year == 2019)
```


```{r}
steam_years %>% 
  mutate(month = month(release_date)) %>% 
  filter(release_year == 2019) %>% 
  select(month) %>% 
  arrange(desc(month))
```
```{r}
backloggd
```

```{r}
backloggd_genres <- backloggd %>% 
  pivot_longer(cols = genre_tag:genre_tag_7, names_to = "list", values_to = "genre") %>% 
  drop_na(genre)
```

```{r}
# Add the release year and keep only rows whose `years_since_release` is
# above -0.01; per the original note, this is meant to remove titles that
# have not been released yet.
backloggd_genres <- backloggd_genres %>%
  mutate(release_year = year(release_date)) %>%
  filter(years_since_release > -0.01)
```

```{r}
backloggd_genres %>% 
  distinct(release_year) %>% 
  arrange(desc(release_year))
```
```{r}
# Label each row with its release-year band (`intervals`). between() is
# inclusive at both ends; a missing year, or one after 2023, falls through
# to the final branch.
backloggd_genres <- backloggd_genres %>%
  mutate(
    intervals = case_when(
      between(release_year, 2020, 2023) ~ "2020 - 2023",
      between(release_year, 2014, 2019) ~ "2014 - 2019",
      between(release_year, 2008, 2013) ~ "2008 - 2013",
      between(release_year, 2002, 2007) ~ "2002 - 2007",
      between(release_year, 1996, 2001) ~ "1996 - 2001",
      between(release_year, 1991, 1995) ~ "1991 - 1995",
      between(release_year, 1985, 1990) ~ "1985 - 1990",
      between(release_year, 1981, 1984) ~ "1981 - 1984",
      release_year < 1981 ~ "1980 and before",
      TRUE ~ "this shouldnt appear"
    )
  )
```

```{r}
# Not a chart of the most popular genres in a timeframe; it is more of an
# insight into what people are playing. Backloggd isn't the biggest service
# in the world, but it gives a good idea of what people like and still play.
# NOTE(review): `n` is the number of rows per genre within each release
# window, while the y-axis label talks about how many people have played -
# confirm which of the two is intended.
backloggd_genres %>%
  mutate(genre = if_else(genre == "Platform", "Platformer", genre)) %>%
  # The two earliest release windows are left out of the chart.
  filter(
    intervals != "1980 and before",
    intervals != "1981 - 1984"
  ) %>%
  group_by(intervals) %>%
  count(genre) %>%
  slice_max(order_by = n, n = 2) %>%
  ggplot(aes(x = intervals, y = n, fill = genre)) +
  geom_col(colour = "black") +
  scale_fill_manual(
    values = c(
      "Platformer" = "#04E762",
      # "Action" = "#DC0073",
      # "Sports" = "#008BF8",
      # "Misc" = "#f5b700",
      "Shooter" = "#390099",
      "Adventure" = "#8ac926",
      "RPG" = "#ff6b35"
    )
  ) +
  labs(
    title = "What do people still play?",
    subtitle = "Split into Genre and Release Year",
    caption = "From March 2023 Backloggd data",
    x = "Release window",
    y = "How many people have played ",
    fill = "Genre"
  ) +
  theme_light() +
  theme(
    axis.line = element_line(colour = "black"),
    axis.text.x = element_text(
      angle = 25, hjust = 0.5, vjust = 0.8, size = 9, face = "bold"
    ),
    axis.text.y = element_text(size = 9, face = "bold"),
    axis.title.x = element_text(face = "italic", size = 10),
    axis.title.y = element_text(face = "italic", size = 10),
    plot.caption = element_text(size = 7, hjust = 1.25)
  )
```

# We have established that Indie, Action and Adventure games seem to be the most popular - But what examples of these are there?

